This is a quick tutorial to get started with scikit-learn.

Parts of the code presented here are based on this machine learning tutorial.

First, let's take a look at the versions of the libraries involved.


In [1]:
import numpy;      print('numpy:\t', numpy.__version__, sep='\t')
import scipy;      print('scipy:\t', scipy.__version__, sep='\t')
import matplotlib; print('matplotlib:', matplotlib.__version__, sep='\t')
import sklearn;    print('scikit-learn:', sklearn.__version__, sep='\t')


numpy:		1.10.4
scipy:		0.17.0
matplotlib:	1.5.1
scikit-learn:	0.17.1

Next, load some example data. scikit-learn bundles a few small datasets, including the classic Iris dataset used below.


In [2]:
from sklearn import datasets

#datasets.load_ -> [press tab for completion]
iris = datasets.load_iris()
iris.keys()


Out[2]:
dict_keys(['target', 'target_names', 'feature_names', 'data', 'DESCR'])
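
The object returned by load_iris is a scikit-learn Bunch, a dict whose keys are also readable as attributes; later cells use the attribute form. A quick check (not in the original notebook):

iris['data'] is iris.data   # True -- same array either way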

In [3]:
for k in iris.keys():
    print('\n== ', k, '==\n', str(iris[k])[0:390])


==  target ==
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2]

==  target_names ==
 ['setosa' 'versicolor' 'virginica']

==  feature_names ==
 ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']

==  data ==
 [[ 5.1  3.5  1.4  0.2]
 [ 4.9  3.   1.4  0.2]
 [ 4.7  3.2  1.3  0.2]
 [ 4.6  3.1  1.5  0.2]
 [ 5.   3.6  1.4  0.2]
 [ 5.4  3.9  1.7  0.4]
 [ 4.6  3.4  1.4  0.3]
 [ 5.   3.4  1.5  0.2]
 [ 4.4  2.9  1.4  0.2]
 [ 4.9  3.1  1.5  0.1]
 [ 5.4  3.7  1.5  0.2]
 [ 4.8  3.4  1.6  0.2]
 [ 4.8  3.   1.4  0.1]
 [ 4.3  3.   1.1  0.1]
 [ 5.8  4.   1.2  0.2]
 [ 5.7  4.4  1.5  0.4]
 [ 5.4  3.9  1.3  0.4]

==  DESCR ==
 Iris Plants Database

Notes
-----
Data Set Characteristics:
    :Number of Instances: 150 (50 in each of three classes)
    :Number of Attributes: 4 numeric, predictive attributes and the class
    :Attribute Information:
        - sepal length in cm
        - sepal width in cm
        - petal length in cm
        - petal width in cm
        - class:
                - Iris-Setosa
       

In [4]:
for k in iris.keys():
    print(k, ':', type(iris[k]))


target : <class 'numpy.ndarray'>
target_names : <class 'numpy.ndarray'>
feature_names : <class 'list'>
data : <class 'numpy.ndarray'>
DESCR : <class 'str'>

In [5]:
[(k, iris[k].shape) for k in iris.keys() if type(iris[k]) == numpy.ndarray]


Out[5]:
[('target', (150,)), ('target_names', (3,)), ('data', (150, 4))]
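
So data holds one row of four measurements per sample, target holds the matching class index, and target_names maps those indices to species. A one-liner (my addition, just to illustrate the mapping) indexes target_names with target:

iris.target_names[iris.target][:5]
# -> array(['setosa', 'setosa', 'setosa', 'setosa', 'setosa'], ...)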

In [6]:
# note: this also imports numpy as np, imports matplotlib.pyplot as plt, and others
%pylab inline


Populating the interactive namespace from numpy and matplotlib
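
Since %pylab puts matplotlib's plotting functions directly into the namespace, a minimal sketch (not part of the original) of the first two features already shows the class structure:

scatter(iris.data[:, 0], iris.data[:, 1], c=iris.target)
xlabel(iris.feature_names[0])
ylabel(iris.feature_names[1])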

Benchmark a classifier with helper functions adapted from ml-benchmarks:


In [7]:
def dtime_to_seconds(dtime):
    return dtime.seconds + (dtime.microseconds * 1e-6)

def bench(func, data, n=10):
    assert n > 2
    score = np.inf
    try:
        time = []
        for i in range(n):
            score, t = func(*data)
            time.append(dtime_to_seconds(t))
        # remove extremal values
        time.pop(np.argmax(time))
        time.pop(np.argmin(time))
    except Exception as detail:
        print('%s error in function %s' % (repr(detail), func))
        time = []
    return score, np.array(time)

def bench_skl(X, y, T, valid):
    from sklearn import linear_model, ensemble
    start = datetime.now()
    # http://scikit-learn.org/stable/modules/classes.html
    clf = ensemble.RandomForestClassifier(n_estimators=1000, n_jobs=5, verbose=0)
    #clf = linear_model.ElasticNet(alpha=0.5, l1_ratio=0.5)
    #clf = linear_model.LogisticRegression()
    #clf = neighbors.NeighborsClassifier(n_neighbors=n_neighbors, algorithm='brute_inplace')
    #clf = skl_cluster.KMeans(k=n_components, n_init=1)
    #...
    clf.fit(X, y)

    ## Regression
    # pred = clf.predict(T)
    # delta = datetime.now() - start
    # mse = np.linalg.norm(pred - valid, 2) ** 2
    # return mse, delta

    # Classification
    score = np.mean(clf.predict(T) == valid)
    return score, datetime.now() - start

from sklearn import datasets
import numpy as np
from datetime import datetime

iris = datasets.load_iris()

# random split: with TH = 0.7, roughly 30% of the samples land in the
# training set (X, Y) and the remaining ~70% in the test set (T, valid)
sample_range = np.random.random_sample(size=iris.target.shape[0])
TH = 0.7

X = np.array([iris.data[i] for i in range(len(iris.target)) if sample_range[i] >= TH])
Y = np.array([iris.target[i] for i in range(len(iris.target)) if sample_range[i] >= TH])
T = np.array([iris.data[i] for i in range(len(iris.target)) if sample_range[i] < TH])
valid = np.array([iris.target[i] for i in range(len(iris.target)) if sample_range[i] < TH])

num_tries = 25
score, times = bench(bench_skl, (X,Y,T,valid), num_tries)
print('Tries:', num_tries, 'Score:', score, 'Time:', np.mean(times), '(mean)', np.median(times), '(median)')


Tries: 25 Score: 0.945945945946 Time: 0.850540608696 (mean) 0.826801 (median)
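
For comparison: in scikit-learn 0.18 and later the manual threshold split above can be replaced by train_test_split; the sketch below (assuming the newer sklearn.model_selection module is available) does roughly the same thing in fewer lines:

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

# ~30% of the samples for training, mirroring the TH = 0.7 threshold above
X, T, Y, valid = train_test_split(iris.data, iris.target, train_size=0.3)
clf = RandomForestClassifier(n_estimators=1000, n_jobs=5).fit(X, Y)
print('Score:', clf.score(T, valid))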